/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	phat_predictors.dta
			pdb_full_structures.dta
			pdb_full_papers.dta

Output: 	Table E3

Purpose: 	Boostrap the SEs for our main results in Table 2, taking into
			account additional uncertainty from estimating P

*******************************************************************************/

clear all
set maxvar 32767
set seed 410

* Set bootstrap repetitions (200)
local reps = 200

/*------------------------------------------------------------------------------

	Table E3: Effect of Potential on Competition, Maturation, Quality
			 (Bootstrapped Standard Errors)

------------------------------------------------------------------------------*/

/******************************************************************************

Step 1: Sample from the data and run the LASSO procedure repeatedly.
		Save the predicted citations from each iteration
		
*******************************************************************************/	

* Generate empty .dta file to store reps
set obs 1
gen structureId = ""
gen predPtileCites3_boot = .
gen boot = .
save "${data_built}bootstrap/boot_phat_draws_temp.dta", replace

* Loop over bootstrap replications
forval boot = 1/`reps' {
	use "${data_built}bootstrap/phat_predictors.dta", clear

	* Sample the data with replacement 
	bsample _N 

	* Lasso estimation
	rlasso ptileCites3 i.groupClassification i.groupMacroMolecule 			///
	i.groupTaxonomy i.groupGene before_pdb before_pdb2 i.publicationYear,   ///
	prestd center ols robust
 
	* Prediction (truncate predictions at 0 / 100)
	predict predPtileCites3_boot, ols 
	replace predPtileCites3_boot = 100 if predPtileCites3_boot > 100
	replace predPtileCites3_boot = 0 if predPtileCites3_boot < 0
	
	* Mark the bootstrap replication number
	gen boot = `boot'
	
	keep structureId predPtileCites3_boot boot
	
	* Append to the previous replications and save
	append using "${data_built}bootstrap/boot_phat_draws_temp.dta"
	save "${data_built}bootstrap/boot_phat_draws_temp.dta", replace	
}	

erase "${data_built}bootstrap/boot_phat_draws_temp.dta"
save "${data_built}bootstrap/boot_phat_draws.dta", replace		
	
/*******************************************************************************

Step 2: Estimate the regressions on each dataset of predicted citations

*******************************************************************************/	 	
	
* Assemble data
clear all
use "${data_built}pdb_full_structures.dta"
merge 1:1 structureId using "${data_built}pdb_analysis.dta"
assert _merge != 2
keep if _merge == 3
drop _merge

merge m:1 pubmedId using "${data_built}pdb_full_papers.dta"
keep if _merge != 2
drop _merge

* Generate a last author fixed effect
encode lastAuthorId, gen(lastAuthorId_num)

* Save this assembled data
tempfile data
save `data'
	
* Initialize matrix to store results from bootstrap
matrix boot_results = J(`reps',10,.)
local row = 1
local col = 1	

* Loop over bootstrap replications		
forval boot = 1/`reps' {	
	
	* Grab the predicted p-hats from the relevant bootstrap iteration
 	use "${data_built}bootstrap/boot_phat_draws.dta", clear
	keep if boot == `boot'
	drop boot
	
	* Merge on to the assembled data
	merge m:1 structureId using `data', keep(3) nogen
		* Mental check: ~8,000 out of 21,951 don't merge back each time 
		* ie those structures were not drawn in that  particular bootstrap rep
		* Probability any given structure not drawn is (1-1/21951)^21951 = .367
		* And .367 * 21,951 = 8075.14, so that checks out
	
	rename predPtileCites3_boot predPtileCites3
	
	* Only keep non-SG structures
	keep if structuralGenomics == 0	

	
 * Competition (Priority race)
	reg priorityRace predPtileCites3 $time_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col
		
	reg priorityRace predPtileCites3 $time_controls $complexity_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	
	
 * Maturation
	reg maturation predPtileCites3 $time_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col
		
	reg maturation predPtileCites3 $time_controls $complexity_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 
	
	areg maturation predPtileCites3 $time_controls, absorb(lastAuthorId_num) r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 
		
	areg maturation predPtileCites3 $time_controls $complexity_controls, 	///
	absorb(lastAuthorId_num) r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 	
	
 * Quality (std. index)
	reg indexStd predPtileCites3 $time_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col
		
	reg indexStd predPtileCites3 $time_controls $complexity_controls, r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 
	
	areg indexStd predPtileCites3 $time_controls, absorb(lastAuthorId_num) r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 
		
	areg indexStd predPtileCites3 $time_controls $complexity_controls, 		///
	absorb(lastAuthorId_num) r
		matrix boot_results[`row',`col'] = _b[predPtileCites3]
		local ++col	 	 

	local ++row
	local col = 1
} 
 
clear
svmat boot_results
save "${data_built}bootstrap/boot_estimates.dta", replace
 
/*******************************************************************************

Step 3: Export bootstrap results to table

*******************************************************************************/	 

* Initialize matrix to store results
matrix output = J(10,5,.)
local row = 1
local col = 1	 

* Set .025 and .0975 cutoffs depending on number of reps
	local ci95_l = round(`reps'/40)
		if `ci95_l' == 0 {
			local ci95_l = 1
		}  // just a little check for testing low number of reps
	local ci95_h = `reps'-`ci95_l'+1
	di `ci95_l'
	di `ci95_h'
 
use "${data_built}bootstrap/boot_estimates.dta", clear
 
* Loop over the different regression models
forval model = 1/5 {
 	
	* Regressions without complexity controls
		
		local ii = `model'*2 - 1
		sort boot_results`ii' 
		
		* Mean and SD across replications		
		sum boot_results`ii' 
		matrix output[`row',`col'] = r(mean)
		local ++row
		matrix output[`row',`col'] = r(sd)
		local ++row
			 
		* Numerical confidence intervals 	
		matrix output[`row',`col'] = boot_results`ii'[`ci95_l'] 
		local ++row
		matrix output[`row',`col'] = boot_results`ii'[`ci95_h'] 
		local ++row
					
		* Implied p-values	
		if r(mean) > 0 {
			count if boot_results`ii' < 0
			matrix output[`row',`col'] = r(N)/`reps'
		}
		if r(mean) <= 0 {
			count if boot_results`ii' > 0
			matrix output[`row',`col'] = r(N)/`reps'
		}
	
	* Regressions with complexity controls
			
		local ++row	
		local ++ii
 
		* Mean and SD across replications		
		sum boot_results`ii' 
		matrix output[`row',`col'] = r(mean)
		local ++row
		matrix output[`row',`col'] = r(sd)
		local ++row
 
		* Numerical confidence intervals 		
		matrix output[`row',`col'] = boot_results`ii'[`ci95_l'] 
		local ++row
		matrix output[`row',`col'] = boot_results`ii'[`ci95_h'] 
		local ++row
		
		* Implied p-values		
		if r(mean) > 0 {
			count if boot_results`ii' < 0
			matrix output[`row',`col'] = r(N)/`reps'
		}
		if r(mean) <= 0 {
			count if boot_results`ii' > 0
			matrix output[`row',`col'] = r(N)/`reps'
		}
		local row = 1
		local ++col
 }
 
* Output results 
clear
svmat output
export excel "${tables}Tables.xlsx", sheet(tableE3) cell(D60) sheetmodify

* Erase intermediate files
erase "${data_built}bootstrap/boot_phat_draws.dta"
erase "${data_built}bootstrap/boot_estimates.dta"
 
